import scrapy
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
from tieba.items import TiebaItem

# NOTE: Zhihu now loads this data via AJAX, so this spider no longer works.
# The same content is available as JSON from the API endpoint below:
#   https://www.zhihu.com/api/v4/topics/19552832/feeds/essence?limit=10&offset=0

class tieba(CrawlSpider):
    """Spider that scrapes top answers from a Zhihu topic page.

    NOTE(review): this subclasses CrawlSpider but defines no ``rules`` and
    overrides ``parse`` (which CrawlSpider reserves for rule processing).
    It effectively behaves like a plain ``scrapy.Spider``.
    """

    name = 'tieba'
    start_urls = ['https://www.zhihu.com/topic/19552832/top-answers?page=1']
    # Upper bound (exclusive) for follow-up pagination; pages 2..max_page-1
    # are requested from every parse() call. Default preserves the original
    # behavior of requesting only page 2.
    max_page = 3

    def parse(self, response):
        """Extract one TiebaItem per answer on the page, then queue
        follow-up page requests.

        Scrapy's duplicate filter prevents the repeated pagination yields
        from re-fetching the same URLs.
        """
        item = TiebaItem()
        selector = Selector(response)
        infos = selector.xpath('//div[@class="List-item TopicFeedItem"]')
        for info in infos:
            try:
                question = info.xpath('div/h2/div/a/text()').extract()[0].strip()
                # Fix: original stored the raw SelectorList; extract the text
                # like every other field so the item holds a plain string.
                favour = info.xpath('div[@class="ContentItem-actions"]/span/button[1]/text()').extract()[0].strip()
                user = info.xpath('div/div[1]/div/div/div[1]/span/div/div/a/text()').extract()[0]
                user_info = info.xpath('div[@class="AuthorInfo-detail"]/div/div/text()').extract()[0].strip()
                content = info.xpath('div[@class="RichContent-inner"]/span/text()').extract()[0].strip()

                item['question'] = question
                item['favour'] = favour
                item['user'] = user
                item['user_info'] = user_info
                item['content'] = content

                yield item
            except IndexError:
                # A missing field (e.g. an ad or promoted block without the
                # expected structure) — skip this entry rather than abort.
                pass

        # Queue the remaining topic pages.
        urls = ['https://www.zhihu.com/topic/19552832/top-answers?page={}'.format(i)
                for i in range(2, self.max_page)]
        for url in urls:
            yield Request(url, callback=self.parse)